#
# R Code for Handout 4 - Tree-based Regression
#
# For the preliminary example read in the file Ozone.csv from the workshop
# webpage - http://course1.winona.edu/bdeppa/dsciwork.html
#
Ozdata = read.table(file.choose(),header=T,sep=",")

library(rpart)
library(rpart.plot)

oz.rpart <- rpart(upoz ~ inbh + safb,data=Ozdata)
summary(oz.rpart)
plot(oz.rpart)
text(oz.rpart)
post(oz.rpart,"Regression Tree for Upper Ozone Concentration")

# Visualize the fitted surface over a 100 x 100 grid of the two predictors
x1 = seq(min(Ozdata$inbh),max(Ozdata$inbh),length=100)
x2 = seq(min(Ozdata$safb),max(Ozdata$safb),length=100)
x = expand.grid(inbh=x1,safb=x2)
ypred = predict(oz.rpart,newdata=x)
persp(x1,x2,z=matrix(ypred,100,100),theta=45,
      xlab="INBH",ylab="SAFB",zlab="UPOZ")

plot(oz.rpart,uniform=T,branch=1,compress=T,margin=0.05,cex=.5)
text(oz.rpart,all=T,use.n=T,fancy=T,cex=.7)
title(main="Regression Tree for Upper Ozone Concentration")

require(rpart.plot)
prp(oz.rpart,main="Tree for Upper Ozone Concentration",type=4)

#
# Example 1 - Diamonds data
#
Diamonds = read.table(file.choose(),header=T,sep=",")
names(Diamonds)
table(Diamonds$Test)

# We will use these again, so save them.
diam.train = Diamonds[Diamonds$Test==0,-10]
diam.valid = Diamonds[Diamonds$Test==1,-10]
diam.test  = Diamonds[Diamonds$Test==2,-10]

tree1 = rpart(Price~.,data=diam.train)
prp(tree1,type=3,main="Regression Tree for Diamond Prices")
summary(tree1)

par(mfrow=c(1,2))
plot(diam.train$Price,predict(tree1),xlab="Actual Price",ylab="Fitted Values")
abline(0,1)
plot(predict(tree1),resid(tree1),xlab="Fitted Values",ylab="Residuals")
abline(h=0)
par(mfrow=c(1,1))

# A much larger tree via a smaller complexity parameter (cp)
tree2 = rpart(Price~.,data=diam.train,control=rpart.control(cp=.0001))
prp(tree2,main="Large Regression Tree for Diamond Prices",cex=0.5)

par(mfrow=c(1,2))
plot(diam.train$Price,predict(tree2),xlab="Actual Price",ylab="Fitted Price")
abline(0,1,lwd=2,col="blue")
plot(predict(tree2),resid(tree2),xlab="Fitted Values",ylab="Residuals")
abline(h=0,lwd=2,col="red")
par(mfrow=c(1,1))

# Prediction accuracy summaries: RMSEP, MAE, and MAPE (in percent)
PredAcc = function(y,ypred){
  RMSEP = sqrt(mean((y-ypred)^2))
  MAE = mean(abs(y-ypred))
  MAPE = mean(abs(y-ypred)/y)*100
  cat("RMSEP\n")
  cat("===============\n")
  cat(RMSEP,"\n\n")
  cat("MAE\n")
  cat("===============\n")
  cat(MAE,"\n\n")
  cat("MAPE\n")
  cat("===============\n")
  cat(MAPE,"\n\n")
  return(data.frame(RMSEP=RMSEP,MAE=MAE,MAPE=MAPE))
}

ypred = predict(tree1,newdata=diam.valid)
PredAcc(diam.valid$Price,ypred)

ypred2 = predict(tree2,newdata=diam.valid)
PredAcc(diam.valid$Price,ypred2)

# Compare to the best multiple linear regression model
bestMLR = lm(log(Price)~poly(Carats,3)+Clarity*Color+Cut+TDdiff+TDratio,
             data=diam.train)
ypredlog = predict(bestMLR,newdata=diam.valid)
ypredMLR = exp(ypredlog)
PredAcc(diam.valid$Price,ypredMLR)

# Tree for the log-scale response; back-transform before assessing accuracy
tree3 = rpart(log(Price)~.,data=diam.train,cp=.0001)
ypredlog = predict(tree3,newdata=diam.valid)
ypred3 = exp(ypredlog)
PredAcc(diam.valid$Price,ypred3)

# Use the cross-validation results plot to choose cp
tree4 = rpart(log(Price)~.,data=diam.train,cp=.0001)
plotcp(tree4)

tree.opt = rpart(log(Price)~.,data=diam.train,cp=.00058)
ypredlog = predict(tree.opt,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

?rpart.control

tree.opt = rpart(log(Price)~.,data=diam.train,
                 control=rpart.control(cp=.00005,minsplit=5))
ypredlog = predict(tree.opt,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)
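#
# Note: rather than eyeballing the plotcp() display, the cp table stored in
# the fitted tree can be used to pick cp programmatically.  This is a sketch,
# not part of the original handout; tree.auto is a name introduced here for
# illustration.
#
opt.cp = tree4$cptable[which.min(tree4$cptable[,"xerror"]),"CP"]
opt.cp
tree.auto = rpart(log(Price)~.,data=diam.train,cp=opt.cp)
ypred = exp(predict(tree.auto,newdata=diam.valid))
PredAcc(diam.valid$Price,ypred)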
#
# Split-sample cross-validation for rpart models (raw-scale response)
#
rpart.sscv = function(fit,data,p=.667,B=100,
                      cp=fit$control$cp,minsplit=fit$control$minsplit) {
  MSE = rep(0,B)
  MAE = rep(0,B)
  MAPE = rep(0,B)
  y = fit$y
  n = nrow(data)
  ss <- floor(n*p)
  for (i in 1:B) {
    sam = sample(1:n,ss,replace=F)
    fit2 = rpart(formula(fit),data=data[sam,],cp=cp,minsplit=minsplit)
    ynew = predict(fit2,newdata=data[-sam,])
    MSE[i] = mean((y[-sam]-ynew)^2)
    MAE[i] = mean(abs(y[-sam]-ynew))
    MAPE[i] = mean(abs(y[-sam]-ynew)/y[-sam])
  }
  RMSEP = sqrt(mean(MSE))
  MAEP = mean(MAE)
  MAPEP = mean(MAPE)
  cat("RMSEP\n")
  cat("===============\n")
  cat(RMSEP,"\n\n")
  cat("MAEP\n")
  cat("===============\n")
  cat(MAEP,"\n\n")
  cat("MAPEP\n")
  cat("===============\n")
  cat(MAPEP,"\n\n")
  temp = data.frame(MSEP=MSE,MAEP=MAE,MAPEP=MAPE)
  return(temp)
}

#
# Same idea for trees fit to log(y): exponentiate both the stored response
# and the predictions before computing the error measures.
#
rpart.logsscv = function(fit,data,p=.667,B=100,
                         cp=fit$control$cp,minsplit=fit$control$minsplit) {
  MSE = rep(0,B)
  MAE = rep(0,B)
  MAPE = rep(0,B)
  y = exp(fit$y)
  n = nrow(data)
  ss <- floor(n*p)
  for (i in 1:B) {
    sam = sample(1:n,ss,replace=F)
    fit2 = rpart(formula(fit),data=data[sam,],cp=cp,minsplit=minsplit)
    ynew = exp(predict(fit2,newdata=data[-sam,]))
    MSE[i] = mean((y[-sam]-ynew)^2)
    MAE[i] = mean(abs(y[-sam]-ynew)/1)
    MAPE[i] = mean(abs(y[-sam]-ynew)/y[-sam])*100
  }
  RMSEP = sqrt(mean(MSE))
  MAEP = mean(MAE)
  MAPEP = mean(MAPE)
  cat("RMSEP\n")
  cat("===============\n")
  cat(RMSEP,"\n\n")
  cat("MAEP\n")
  cat("===============\n")
  cat(MAEP,"\n\n")
  cat("MAPEP\n")
  cat("===============\n")
  cat(MAPEP,"\n\n")
  temp = data.frame(MSEP=MSE,MAEP=MAE,MAPEP=MAPE)
  return(temp)
}

#
# Read in the Diamonds.csv file again!
#
Diamonds = read.table(file.choose(),header=T,sep=",")
Diamonds = Diamonds[,-10]

tree1 = rpart(log(Price)~.,data=Diamonds)
results = rpart.logsscv(tree1,data=Diamonds,cp=.00005,minsplit=5)
results = rpart.logsscv(tree1,data=Diamonds,cp=.0005,minsplit=5)
results = rpart.logsscv(tree1,data=Diamonds,cp=.000025,minsplit=5)
results = rpart.logsscv(tree1,data=Diamonds,cp=.00005,minsplit=3)

#
# Combine the training and validation sets to fit the final model
# before predicting the test diamonds.
#
temp = rbind(diam.train,diam.valid)
rpart.final = rpart(log(Price)~.,data=temp,cp=.00005,minsplit=5)

# Predict the test cases using the model fit to the combined
# training and validation sets.
ypredlog = predict(rpart.final,newdata=diam.test)
ypred.test = exp(ypredlog)

#
# Tasks - you will need to read in the ChiHomes(train).csv and
# ChiHomes(test).csv files.
#
library(rpart)
library(rpart.plot)
ChiTrain = read.table(file.choose(),header=T,sep=",")
ChiTest = read.table(file.choose(),header=T,sep=",")
ChiTrain2 = ChiTrain[,-3]
ChiTest2 = ChiTest[,-3]

chi.rpart = rpart(log(ListPrice)~.,data=ChiTrain2)
par(mfrow=c(1,1))
plot(chi.rpart)
text(chi.rpart)
prp(chi.rpart,type=4,digits=4)

#
# Code for Task 3 on pg. 20
#
Diamonds = read.table(file.choose(),header=T,sep=",")
library(rpart)
library(rpart.plot)
fit = rpart(log(Price)~.,data=Diamonds)

# Fit and display a tree built from a random half of the data to see how
# much the fitted tree changes from sample to sample.
tree.vary = function(fit,data) {
  n = nrow(data)
  sam = sample(1:n,floor(n*.5),replace=F)
  temp = rpart(formula(fit),data=data[sam,])
  prp(temp,type=4,digits=3)
}
tree.vary(fit,data=Diamonds)
tree.vary(fit,data=Diamonds)
tree.vary(fit,data=Diamonds)   # Etc.
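#
# The instability tree.vary displays can also be seen numerically.  A quick
# sketch (not in the handout): fit trees to two independent half-samples and
# plot their predictions for the full data set against each other.
#
n = nrow(Diamonds)
fitA = rpart(log(Price)~.,data=Diamonds[sample(1:n,floor(n*.5)),])
fitB = rpart(log(Price)~.,data=Diamonds[sample(1:n,floor(n*.5)),])
plot(predict(fitA,newdata=Diamonds),predict(fitB,newdata=Diamonds),
     xlab="Tree A predictions",ylab="Tree B predictions")
abline(0,1,lwd=2,col="blue")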
#
# BAGGING - bagged regression trees via the ipred package
#
library(ipred)
diam.train = Diamonds[Diamonds$Test==0,-10]
diam.valid = Diamonds[Diamonds$Test==1,-10]
diam.test  = Diamonds[Diamonds$Test==2,-10]

diam.bag = bagging(log(Price)~.,data=diam.train,coob=T,nbagg=10,
                   control=rpart.control(cp=.005,minsplit=5,xval=0))
diam.bag
ypredlog = predict(diam.bag,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

diam.bag2 = bagging(log(Price)~.,data=diam.train,coob=T,nbagg=25,
                    control=rpart.control(cp=.00005,minsplit=5,xval=0))
diam.bag2
ypredlog = predict(diam.bag2,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

diam.bag2 = bagging(log(Price)~.,data=diam.train,coob=T,nbagg=100,
                    control=rpart.control(cp=.00005,minsplit=5,xval=0))
diam.bag2
ypredlog = predict(diam.bag2,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

diam.bag2 = bagging(log(Price)~.,data=diam.train,coob=T,nbagg=1000,
                    control=rpart.control(cp=.00005,minsplit=5,xval=0))
diam.bag2
ypredlog = predict(diam.bag2,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

# WARNING: DON'T RUN THE MODEL BELOW (VERY, VERY SLOW)!
diam.bag2 = bagging(log(Price)~.,data=diam.train,coob=T,nbagg=10000,
                    control=rpart.control(cp=.00005,minsplit=5,xval=0))
diam.bag2
ypredlog = predict(diam.bag2,newdata=diam.valid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

#
# Split-sample CV for bagged RPART models
#
bag.sscv = function(fit,data,p=.667,M=100,B=25,
                    cp=fit$control$cp,minbucket=fit$control$minbucket,
                    minsplit=fit$control$minsplit) {
  OOBMSE = rep(0,M)
  MSE = rep(0,M)
  MAE = rep(0,M)
  MAPE = rep(0,M)
  y = fit$y
  n = nrow(data)
  ss <- floor(n*p)
  for (i in 1:M) {
    sam = sample(1:n,ss,replace=F)
    fit2 = bagging(formula(fit),data=data[sam,],nbagg=B,coob=T,
                   control=rpart.control(cp=cp,minbucket=minbucket,
                                         minsplit=minsplit,xval=0))
    ynew = predict(fit2,newdata=data[-sam,])
    OOBMSE[i] = fit2$err
    MSE[i] = mean((y[-sam]-ynew)^2)
    MAE[i] = mean(abs(y[-sam]-ynew))
    MAPE[i] = mean(abs(y[-sam]-ynew)/y[-sam])
  }
  OOB.RMSEP = mean(OOBMSE)
  RMSEP = sqrt(mean(MSE))
  MAEP = mean(MAE)
  MAPEP = mean(MAPE)
  cat("OOB RMSEP\n")
  cat("===============\n")
  cat(OOB.RMSEP,"\n\n")
  cat("RMSEP\n")
  cat("===============\n")
  cat(RMSEP,"\n\n")
  cat("MAE\n")
  cat("===============\n")
  cat(MAEP,"\n\n")
  cat("MAPE\n")
  cat("===============\n")
  cat(MAPEP,"\n\n")
  temp = data.frame(OOB.RMSEP=OOBMSE,MSEP=MSE,MAEP=MAE,MAPEP=MAPE)
  return(temp)
}

fit = rpart(log(Price)~.,data=Diamonds,cp=.00005,minsplit=5)

# Create the log-scale version by editing bag.sscv (exponentiate fit$y and
# the predictions, just as rpart.logsscv does).
bag.logsscv = edit(bag.sscv)
results = bag.logsscv(fit,Diamonds,M=25,B=50)
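#
# If you would rather not use edit(), here is a sketch of what bag.logsscv
# might look like, following the same pattern as rpart.logsscv.  This
# version is an illustration, not the handout's own code; the OOB piece is
# omitted because fit2$err is reported on the log scale.
#
bag.logsscv = function(fit,data,p=.667,M=100,B=25,
                       cp=fit$control$cp,minbucket=fit$control$minbucket,
                       minsplit=fit$control$minsplit) {
  MSE = rep(0,M); MAE = rep(0,M); MAPE = rep(0,M)
  y = exp(fit$y)          # back-transform the stored log response
  n = nrow(data)
  ss <- floor(n*p)
  for (i in 1:M) {
    sam = sample(1:n,ss,replace=F)
    fit2 = bagging(formula(fit),data=data[sam,],nbagg=B,coob=T,
                   control=rpart.control(cp=cp,minbucket=minbucket,
                                         minsplit=minsplit,xval=0))
    ynew = exp(predict(fit2,newdata=data[-sam,]))  # back-transform predictions
    MSE[i] = mean((y[-sam]-ynew)^2)
    MAE[i] = mean(abs(y[-sam]-ynew))
    MAPE[i] = mean(abs(y[-sam]-ynew)/y[-sam])*100
  }
  cat("RMSEP =",sqrt(mean(MSE)),"\n")
  cat("MAEP  =",mean(MAE),"\n")
  cat("MAPEP =",mean(MAPE),"\n")
  return(data.frame(MSEP=MSE,MAEP=MAE,MAPEP=MAPE))
}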
#
# Tasks with the Chicago homes data - you may need to read in the two .csv
# files for these data, ChiHomes(train).csv and ChiHomes(test).csv.
#
library(rpart)
library(rpart.plot)
library(ipred)
ChiTrain = read.table(file.choose(),header=T,sep=",")
ChiTest = read.table(file.choose(),header=T,sep=",")
ChiTrain2 = ChiTrain[,-3]
ChiTest2 = ChiTest[,-3]

#
# You will need to replace the ?? in the code below with some values!!!
#
fit = rpart(log(ListPrice)~.,data=ChiTrain2,cp=??,minsplit=??)
home.bag = bagging(log(ListPrice)~.,data=ChiTrain2,coob=T,nbagg=??,
                   control=rpart.control(cp=??,minsplit=??,xval=0))
ypredlog = predict(home.bag,newdata=ChiTest2)
ypred = exp(ypredlog)
PredAcc(ChiTest2$ListPrice,ypred)

#
# RANDOM FORESTS - we will be reading in new data sets,
# Solubility(train).csv and Solubility(test).csv.
#
library(randomForest)
Solu.train = read.table(file.choose(),header=T,sep=",")
Solu.test = read.table(file.choose(),header=T,sep=",")
names(Solu.train)
str(Solu.train)
head(Solu.train,3)
dim(Solu.train)
dim(Solu.test)

solu.rf = randomForest(log10sol~.,data=Solu.train)
solu.rf
plot(Solu.train$log10sol,predict(solu.rf),xlab="Actual Log10(Solubility)",
     ylab="Predicted Log10(Solubility)")
abline(0,1,lwd=3,col="blue")

#
# Split-sample CV for random forests
#
rf.sscv = function(fit,data,p=.667,B=100,mtry=fit$mtry,ntree=fit$ntree) {
  MSE = rep(0,B)
  MAE = rep(0,B)
  MAPE = rep(0,B)
  y = fit$y
  n = nrow(data)
  ss <- floor(n*p)
  for (i in 1:B) {
    sam = sample(1:n,ss,replace=F)
    fit2 = randomForest(formula(fit),data=data[sam,],mtry=mtry,ntree=ntree)
    ynew = predict(fit2,newdata=data[-sam,])
    MSE[i] = mean((y[-sam]-ynew)^2)
    MAE[i] = mean(abs(y[-sam]-ynew))
    MAPE[i] = mean(abs(y[-sam]-ynew)/y[-sam])
  }
  RMSEP = sqrt(mean(MSE))
  MAEP = mean(MAE)
  MAPEP = mean(MAPE)
  cat("RMSEP\n")
  cat("===============\n")
  cat(RMSEP,"\n\n")
  cat("MAE\n")
  cat("===============\n")
  cat(MAEP,"\n\n")
  cat("MAPE\n")
  cat("===============\n")
  cat(MAPEP,"\n\n")
  temp = data.frame(MSEP=MSE,MAEP=MAE,MAPEP=MAPE)
  return(temp)
}

#
# MAPE will be Inf for these examples because the actual log10(Solubility)
# is 0 (i.e. Solubility = 1) for some of the cases, putting a zero in the
# denominator of the percentage error.
#
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=20,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=30,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=40,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=50,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=60,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=70,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=80,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=90,ntree=500)
results = rf.sscv(solu.rf,data=Solu.train,B=20)
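#
# The grid search above can be automated.  A sketch (not in the handout)
# that loops over the candidate mtry values and stores the split-sample
# RMSEP for each:
#
mtry.grid = seq(20,90,by=10)
rmsep = rep(0,length(mtry.grid))
for (j in 1:length(mtry.grid)) {
  fitj = randomForest(log10sol~.,data=Solu.train,mtry=mtry.grid[j],ntree=500)
  resj = rf.sscv(fitj,data=Solu.train,B=20)
  rmsep[j] = sqrt(mean(resj$MSEP))
}
cbind(mtry=mtry.grid,RMSEP=rmsep)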
# Your best mtry might not be mtry = 80; if it isn't, you can certainly use
# your value instead in the code below.
#
solu.final = randomForest(log10sol~.,data=Solu.train,mtry=80,ntree=500)
solu.final
varImpPlot(solu.final)

par(mfrow=c(2,2))
partialPlot(solu.final,Solu.train,MolWeight)
partialPlot(solu.final,Solu.train,NumCarbon)
partialPlot(solu.final,Solu.train,NumNonHBonds)
partialPlot(solu.final,Solu.train,SurfaceArea1)
par(mfrow=c(1,1))

library(plotmo)
plotmo(solu.final)

ypred = predict(solu.final,newdata=Solu.test)
PredAcc(Solu.test$log10sol,ypred)

#
# Tasks - read in the ChiHomes(train) and ChiHomes(test) files.
#
library(randomForest)
library(plotmo)
ChiTrain = read.table(file.choose(),header=T,sep=",")
ChiTest = read.table(file.choose(),header=T,sep=",")
ChiTrain2 = ChiTrain[,-3]
ChiTest2 = ChiTest[,-3]

home.rf = randomForest(log(ListPrice)~.,data=ChiTrain2)
home.rf
varImpPlot(home.rf)

par(mfrow=c(2,2))
partialPlot(home.rf,ChiTrain2,ImputedSQFT)
partialPlot(home.rf,ChiTrain2,BEDS)
partialPlot(home.rf,ChiTrain2,LATITUDE)
partialPlot(home.rf,ChiTrain2,LONGITUDE)
par(mfrow=c(1,1))

ypredlog = predict(home.rf,newdata=ChiTest2)
ypred = exp(ypredlog)
PredAcc(ChiTest2$ListPrice,ypred)

#
# BOOSTED TREES - read in the Diamonds.csv data again!
#
library(gbm)
Diamonds = read.table(file.choose(),header=T,sep=",")
diam.train = Diamonds[Diamonds$Test==0,-10]
diam.valid = Diamonds[Diamonds$Test==1,-10]
diam.test  = Diamonds[Diamonds$Test==2,-10]

diam.gbm = gbm(log(Price)~.,data=diam.train,distribution="gaussian",
               n.trees=5000,shrinkage=.01,interaction.depth=4,
               bag.fraction=0.5,train.fraction=.8,n.minobsinnode=5,
               cv.folds=5,keep.data=T,verbose=T)
diam.gbm

# Estimate the optimal number of boosting iterations three ways
gbm.perf(diam.gbm,method="OOB")
gbm.perf(diam.gbm,method="test")
gbm.perf(diam.gbm,method="cv")

# n.trees = 3357 was the optimum suggested by gbm.perf for this run;
# your number will differ.
ypred = predict(diam.gbm,newdata=diam.train,n.trees=3357)
plot(log(diam.train$Price),ypred,xlab="Actual log(Price)",
     ylab="Fitted log(Price)")
abline(0,1,lwd=3,col="blue")

ypred = exp(ypred)
plot(diam.train$Price,ypred,xlab="Actual Price ($)",ylab="Fitted Price ($)")
abline(0,1,lwd=3,col="blue")

# A slower-learning fit: stumps (depth 1), far more trees, smaller shrinkage
diam.gbm = gbm(formula=log(Price)~.,distribution="gaussian",data=diam.train,
               n.trees=100000,interaction.depth=1,n.minobsinnode=5,
               shrinkage=0.0025,bag.fraction=0.5,train.fraction=0.8,
               cv.folds=5,keep.data=T,verbose=F)

ypred = predict(diam.gbm,newdata=diam.train,n.trees=40339)
plot(log(diam.train$Price),ypred,xlab="Actual log(Price)",
     ylab="Fitted log(Price)")
abline(0,1,lwd=3,col="blue")

ypred = exp(ypred)
plot(diam.train$Price,ypred,xlab="Actual Price ($)",ylab="Fitted Price ($)")
abline(0,1,lwd=3,col="blue")

# Assess accuracy on the validation diamonds
ypred = exp(predict(diam.gbm,newdata=diam.valid,n.trees=40339))
PredAcc(diam.valid$Price,ypred)

diam.mlr = lm(log(Price)~poly(Carats,3)+Clarity*Color+Cut+TDdiff+TDratio,
              data=diam.train)
ypred = predict(diam.mlr,newdata=diam.valid)
ypred = exp(ypred)
PredAcc(diam.valid$Price,ypred)

#
# Comparing the methods for predicting the test cases.
#
ypred = predict(diam.gbm,newdata=diam.test,n.trees=40923)
plot(log(diam.test$Price),ypred,xlab="Actual log(Price)",
     ylab="Predicted log(Price)",main="Predictions for Test Diamonds")
abline(0,1,lwd=3,col="blue")

ypred = exp(ypred)
plot(diam.test$Price,ypred,xlab="Actual Price ($)",
     ylab="Predicted Price ($)",main="Predictions for Test Diamonds")
abline(0,1,lwd=3,col="blue")
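#
# Rather than hard-coding iteration counts as above (3357, 40339, 40923),
# gbm.perf() returns its estimate of the best iteration, so it can be stored
# and reused.  A sketch, not part of the handout:
#
best.iter = gbm.perf(diam.gbm,method="cv")
ypred = exp(predict(diam.gbm,newdata=diam.valid,n.trees=best.iter))
PredAcc(diam.valid$Price,ypred)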
#
# Solubility data with boosted trees.  Read in the training and test sets
# again: Solubility(train).csv and Solubility(test).csv.
#
Solu.train = read.table(file.choose(),header=T,sep=",")
Solu.test = read.table(file.choose(),header=T,sep=",")

# Multiple linear regression model - the stepwise fit takes a LONG time!
solu.mlr = lm(log10sol~.,data=Solu.train)
solu.step = step(solu.mlr)
ypred = predict(solu.step,newdata=Solu.test)
PredAcc(Solu.test$log10sol,ypred)

#
# Random forest
#
library(randomForest)
solu.rf = randomForest(log10sol~.,data=Solu.train,mtry=80,ntree=500)
solu.rf

#
# Boosted trees
#
library(gbm)
solu.gbm = gbm(log10sol~.,data=Solu.train,distribution="gaussian",
               n.trees=10000,shrinkage=.05,interaction.depth=5,
               bag.fraction=0.5,train.fraction=.8,n.minobsinnode=5,
               cv.folds=5,keep.data=T,verbose=F)
solu.gbm
ypred = predict(solu.gbm,newdata=Solu.test,n.trees=769)
PredAcc(Solu.test$log10sol,ypred)

#
# CUBIST MODELS
#
library(Cubist)
names(diam.train)
X = diam.train[,-1]
y = log(diam.train$Price)

#
# Cubist takes the predictors in a matrix/data frame X and the response in a
# vector y, i.e. we cannot use the y ~ . formula notation.
#
diam.cubist = cubist(X,y)
summary(diam.cubist)

# A committee of 10 model trees
diam.cubist2 = cubist(X,y,committees=10)
ypred = predict(diam.cubist2,newdata=X)
plot(log(diam.train$Price),ypred,xlab="Actual log(Price)",
     ylab="Predicted log(Price)")
abline(0,1,lwd=3,col="blue")
title(main="Fitted Values for Training Data")

ypred = predict(diam.cubist2,newdata=X)
ypred = exp(ypred)
plot(diam.train$Price,ypred,xlab="Actual Price ($)",ylab="Fitted Price ($)")
abline(0,1,lwd=3,col="blue")
title(main="Fitted Values for Training Data")

Xvalid = diam.valid[,-1]
ypredlog = predict(diam.cubist2,newdata=Xvalid)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

#
# Using nearest neighbors as a finishing touch.
#
ypredlog = predict(diam.cubist2,newdata=Xvalid,neighbors=9)
ypred = exp(ypredlog)
PredAcc(diam.valid$Price,ypred)

#
# TASKS - Solubility data
# Read in the Solubility(train).csv and Solubility(test).csv files.
#
library(Cubist)
Solu.train = read.table(file.choose(),header=T,sep=",")
Solu.test = read.table(file.choose(),header=T,sep=",")
Xtrain = Solu.train[,-1]
ytrain = Solu.train[,1]
Xtest = Solu.test[,-1]
ytest = Solu.test[,1]

#
# Replace the "extra stuff" in the commands below with something appropriate!
#
solu.cub = cubist(Xtrain,ytrain,extra stuff)
summary(solu.cub)
ypred = predict(solu.cub,newdata=Xtest,extra stuff)
PredAcc(ytest,ypred)
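#
# A closing sketch (not part of the handout): the neighbors adjustment used
# on the diamonds Cubist fit above can be tuned rather than fixed at 9.
# Assuming diam.cubist2, Xvalid, and diam.valid are still in the workspace,
# this sweeps neighbors = 0,...,9 and tracks validation RMSEP:
#
rmsep = sapply(0:9,function(k){
  yp = exp(predict(diam.cubist2,newdata=Xvalid,neighbors=k))
  sqrt(mean((diam.valid$Price-yp)^2))
})
plot(0:9,rmsep,type="b",xlab="neighbors",ylab="RMSEP",
     main="Tuning the Cubist Neighbor Adjustment")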